{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MLBootcamp  V"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook shows the 7th-place solution to a competition on forecasting cardiovascular diseases (http://mlbootcamp.ru/round/12/sandbox/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "import tqdm\n",
    "import hyperopt\n",
    "import sys\n",
    "import scipy\n",
    "\n",
    "import lightgbm\n",
    "from catboost import CatBoost, CatBoostClassifier\n",
    "\n",
    "from datetime import datetime\n",
    "from sklearn.cross_validation import cross_val_score, KFold\n",
    "from sklearn.metrics import log_loss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data can be obtained from the competition page or through [this link](http://mlbootcamp.ru/media/condition/ml5.zip)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the competition data; both files are semicolon-separated.\n",
    "train_df = pd.read_csv(\n",
    "    '../ml5/train.csv',\n",
    "    sep=';',\n",
    ")\n",
    "# In the test file the literal string 'None' is parsed as a missing value (NaN).\n",
    "test_df = pd.read_csv(\n",
    "    '../ml5/test.csv',\n",
    "    sep=';',\n",
    "    na_values='None',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Data exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(70000, 13) (30000, 12)\n"
     ]
    }
   ],
   "source": [
    "# Report the (rows, columns) shape of both frames. Calling print with one\n",
    "# pre-formatted string produces the same text under Python 2 and Python 3,\n",
    "# whereas the bare print statement is a syntax error on Python 3.\n",
    "print('{} {}'.format(train_df.shape, test_df.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/place/home/d-kruchinin/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=True'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass sort=False\n",
      "\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "# Stack train and test so the cleaning and feature steps below apply to both.\n",
    "# sort=True pins the alphabetical column order that pandas applied by default when\n",
    "# this notebook was run (see the recorded FutureWarning) and silences that warning;\n",
    "# later pandas versions stop sorting by default, which would change the feature order.\n",
    "# ignore_index=True gives every row a unique label instead of repeating the test\n",
    "# file's own 0..n-1 labels, so label-based .loc selection stays unambiguous.\n",
    "full_df = pd.concat([train_df, test_df], axis=0, sort=True, ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#####  height fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f3f4b9ff810>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAD8CAYAAAC/1zkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFsxJREFUeJzt3X+03HV95/Hny6BUrBow0aUJNrFN6aKnrpgiu7burlgIoIR2tQvHU3MsbfYHrrrunhXqnuLR0gPbVpRTf5RK1kBVRNSSXbExotXuOfIjIvLbJgUWIhRSg2CLQoPv/WM+152EuTdzb74zcy95Ps6ZM9/vez4z857v/ZIX3x/znVQVkiR14WmTbkCS9NRhqEiSOmOoSJI6Y6hIkjpjqEiSOmOoSJI6Y6hIkjpjqEiSOmOoSJI6c9CkGxi3JUuW1IoVKybdhiQtGEuWLGHz5s2bq2rNvsYecKGyYsUKtm7dOuk2JGlBSbJkmHHu/pIkdcZQkSR1xlCRJHXGUJEkdcZQkSR1xlCRJHXGUJEkdcZQkSR1xlCRJHXmgPtGvXQgWXHW5388ffd5J0+wEx0o3FKRJHXGUJEkdcZQkSR1xlCRJHXGUJEkdcZQkSR1xlCRJHXGUJEkdcZQkSR1xlCRJHXGUJEkdcZQkSR1xlCRJHXGUJEkdWZkoZJkQ5IHk9zSV/uDJHckuSnJ55Is7nvs7CTbk3w7yQl99TWttj3JWX31lUmuTbItyaeSPGNUn0WSNJxRbql8DFizV20L8JKq+gXgr4GzAZIcBZwGvLg950NJFiVZBHwQOBE4Cji9jQU4H7igqlYBDwFnjPCzSJKGMLJQqaqvAbv2qn2xqna32WuA5W16LXBZVT1WVXcB24Fj2m17Vd1ZVY8DlwFrkwR4NXBFe/5G4NRRfRZJ0nAmeUzlN4EvtOllwL19j+1otenqzwO+1xdQU/WBkqxPsjXJ1p07d3bUviRpbxMJlSTvAnYDH58qDRhWc6gPVFUXVdXqqlq9dOnS2bYrSRrS2H+jPsk64LXAcVU1FQQ7gCP6hi0H7mvTg+p/ByxOclDbWukfL0makLFuqSRZA7wTOKWqHu17aBNwWpKDk6wEVgHXAdcDq9qZXs+gdzB/UwujrwCvb89fB1w5rs8hSRpslKcUfxL4OnBkkh1JzgD+GHg2sCXJjUk+AlBVtwKXA7cBfwGcWVVPtK2QtwCbgduBy9tY6IXTO5Jsp3eM5eJRfRZJ0nBGtvurqk4fUJ72H/6qOhc4d0D9KuCqAfU76Z0dJkmaJ/xGvSSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTMHTboBSeO34qzP7zF/93knT6gTPdWMbEslyYYkDya5pa92WJItSba1+0NbPUkuTLI9yU1Jju57zro2fluSdX31lye5uT3nwiQZ1WeRJA1nlLu/Pgas2at2FnB1Va0Crm7zACcCq9ptPfBh6IUQcA7wCuAY4JypIGpj1vc9b+/3kiSN2chCpaq+Buzaq7wW2NimNwKn9tUvqZ5rgMVJDgdOALZU1a6qegjYAqxpjz2nqr5eVQVc0vdakqQJGfeB+hdU1f0A7f75rb4MuLdv3I5Wm6m+Y0BdkjRB8+Xsr0HHQ2oO9cEvnqxPsjXJ1p07d86xRUnSvow7VB5ou65o9w+2+g7giL5xy4H79lFfPqA+UFVdVFWrq2r10qVL9/tDSJIGG3eobAKmzuBaB1zZV39TOwvsWODhtntsM3B8kkPbAfrjgc3tse8nObad9fWmvteSJE3IyL6nkuSTwL8CliTZQe8srvOAy5OcAdwDvKENvwo4CdgOPAq8GaCqdiV5L3B9G/eeqpo6+P8f6J1h9kzg
C+0mSZqgkYVKVZ0+zUPHDRhbwJnTvM4GYMOA+lbgJfvToySpW/PlQL0k6SnAUJEkdcZrf0lPAf3X8vI6Xpokt1QkSZ0xVCRJnTFUJEmdMVQkSZ0xVCRJnTFUJEmdMVQkSZ0xVCRJnTFUJEmdMVQkSZ0xVCRJnTFUJEmdMVQkSZ0xVCRJnfHS95K8dL4645aKJKkzQ4VKEn8LXpK0T8NuqXwkyXVJ/mOSxSPtSJK0YA0VKlX1S8AbgSOArUk+keRXRtqZJGnBGfqYSlVtA/478E7gXwIXJrkjya+NqjlJ0sIy1NlfSX4BeDNwMrAFeF1V3ZDkp4CvA5+dzZsm+c/AbwEF3Nxe+3DgMuAw4AbgN6rq8SQHA5cALwe+C/zbqrq7vc7ZwBnAE8Bbq2rzbPqQnor6z+SSxm3YLZU/pvcP/Uur6syqugGgqu6jt/UytCTLgLcCq6vqJcAi4DTgfOCCqloFPEQvLGj3D1XVzwIXtHEkOao978XAGuBDSRbNphdJUreGDZWTgE9U1Q8AkjwtySEAVXXpHN73IOCZSQ4CDgHuB14NXNEe3wic2qbXtnna48clSatfVlWPVdVdwHbgmDn0IknqyLCh8iXgmX3zh7TarFXVd4A/BO6hFyYPA98AvldVu9uwHcCyNr0MuLc9d3cb/7z++oDn7CHJ+iRbk2zduXPnXNqWJA1h2FD5iar6+6mZNn3IXN4wyaH0tjJWAj8FPAs4ccDQmnrKNI9NV39yseqiqlpdVauXLl06+6YlSUMZNlT+IcnRUzNJXg78YI7v+RrgrqraWVX/SO8g/78AFrfdYQDLgfva9A56pzLTHn8usKu/PuA5kqQJGDZU3g58OslfJfkr4FPAW+b4nvcAxyY5pB0bOQ64DfgK8Po2Zh1wZZve1OZpj3+5qqrVT0tycJKVwCrgujn2JEnqwFCnFFfV9Ul+HjiS3m6nO9pWxqxV1bVJrqB3Ntlu4JvARcDngcuS/F6rXdyecjFwaZLt9LZQTmuvc2uSy+kF0m7gzKp6Yi49SZK6MZurFP8isKI952VJqKpL5vKmVXUOcM5e5TsZcPZWVf0QeMM0r3MucO5cepAkdW/YLz9eCvwMcCO9LxpC76D4nEJF0v7zS46aj4bdUlkNHNWOZUiSNNCwB+pvAf7JKBuRJC18w26pLAFuS3Id8NhUsapOGUlXkqQFadhQefcom5AkPTUMe0rxV5P8NLCqqr7UrvvlxRslSXsY9ueEf5vexRz/pJWWAX8+qqYkSQvTsAfqzwReCTwCP/7BruePqilJ0sI0bKg8VlWPT820a3B5erEkaQ/DhspXk/wOvd9A+RXg08D/Gl1bkqSFaNhQOQvYSe+nf/8dcBWz/MVHSdJT37Bnf/0I+NN2kyRpoGGv/XUXA46hVNWLOu9IkrRgzebaX1N+gt5Vgw/rvh1J0kI21DGVqvpu3+07VfV+4NUj7k2StMAMu/vr6L7Zp9Hbcnn2SDqSJC1Yw+7++qO+6d3A3cCvd96NJGlBG/bsr3896kYkSQvfsLu/3jHT41X1vm7akTST/fm1R38pUuMwm7O/fhHY1OZfB3wNuHcUTUmSFqbZ/EjX0VX1fYAk7wY+XVW/NarGJEkLz7CXaXkh8Hjf/OPAis67kSQtaMOGyqXAdUneneQc4Frgkrm+aZLFSa5IckeS25P88ySHJdmSZFu7P7SNTZILk2xPclP/6c1J1rXx25Ksm2s/kqRuDPvlx3OBNwMPAd8D3lxVv78f7/sB4C+q6ueBlwK307to5dVVtQq4us0DnAisarf1wIcBkhwGnAO8AjgGOGcqiCRJkzHslgrAIcAjVfUBYEeSlXN5wyTPAV4FXAxQVY9X1feAtcDGNmwjcGqbXgtcUj3XAIuTHA6cAGypql1V9RCwBVgzl54kSd0Y9ueEzwHeCZzdSk8H/myO7/kiepfR/59Jvpnko0meBbygqu4HaPdTvyy5jD3PMtvRatPVJUkTMuyWyq8CpwD/
AFBV9zH3y7QcBBwNfLiqXtZe86wZxmdArWaoP/kFkvVJtibZunPnztn2K0ka0rCh8nhVFe0f7bZlMVc7gB1VdW2bv4JeyDzQdmvR7h/sG39E3/OXA/fNUH+SqrqoqlZX1eqlS5fuR+uSpJkMGyqXJ/kTesczfhv4EnP8wa6q+lvg3iRHttJxwG30vlg5dQbXOuDKNr0JeFM7C+xY4OG2e2wzcHySQ9sB+uNbTZI0IcNe++sP22/TPwIcCfxuVW3Zj/f9T8DHkzwDuJPemWVPoxdeZwD30PvNFuj9dPFJwHbg0TaWqtqV5L3A9W3ce6pq1370JEnaT/sMlSSLgM1V9Rp6Z1jtt6q6kT1/+GvKcQPGFnDmNK+zAdjQRU+SpP23z91fVfUE8GiS546hH0nSAjbstb9+CNycZAvtDDCAqnrrSLqSJC1Iw4bK59tNkqRpzRgqSV5YVfdU1caZxkmSBPs+pvLnUxNJPjPiXiRJC9y+QqX/W+svGmUjkqSFb1+hUtNMS5L0JPs6UP/SJI/Q22J5ZpumzVdVPWek3UmSFpQZQ6WqFo2rEUnSwjeb31ORJGlGhookqTOGiiSpM4aKJKkzw16mRdIBaMVZ///qTHefd/IEO9FC4ZaKJKkzhookqTPu/pLmuf5dUNJ855aKJKkzhookqTPu/pK0B3e3aX+4pSJJ6oyhIknqzMRCJcmiJN9M8r/b/Mok1ybZluRTSZ7R6ge3+e3t8RV9r3F2q387yQmT+SSSpCmT3FJ5G3B73/z5wAVVtQp4CDij1c8AHqqqnwUuaONIchRwGvBiYA3woSReql+SJmgioZJkOXAy8NE2H+DVwBVtyEbg1Da9ts3THj+ujV8LXFZVj1XVXcB24JjxfAJJ0iCT2lJ5P/DfgB+1+ecB36uq3W1+B7CsTS8D7gVojz/cxv+4PuA5kqQJGHuoJHkt8GBVfaO/PGBo7eOxmZ6z93uuT7I1ydadO3fOql9J0vAmsaXySuCUJHcDl9Hb7fV+YHGSqe/NLAfua9M7gCMA2uPPBXb11wc8Zw9VdVFVra6q1UuXLu3200iSfmzsoVJVZ1fV8qpaQe9A+5er6o3AV4DXt2HrgCvb9KY2T3v8y1VVrX5aOztsJbAKuG5MH0OSNMB8+kb9O4HLkvwe8E3g4la/GLg0yXZ6WyinAVTVrUkuB24DdgNnVtUT429bkjRloqFSVX8J/GWbvpMBZ29V1Q+BN0zz/HOBc0fXoSRpNvxGvSSpM4aKJKkzhookqTOGiiSpM4aKJKkzhookqTOGiiSpM/Ppy4+SGn/SVwuVWyqSpM4YKpKkzhgqkqTOGCqSpM4YKpKkzhgqkqTOGCqSpM4YKpKkzhgqkqTOGCqSpM4YKpKkzhgqkqTOGCqSpM54lWJpnlhIVybu7/Xu806eYCeabwwVSUNZSKGnyRn77q8kRyT5SpLbk9ya5G2tfliSLUm2tftDWz1JLkyyPclNSY7ue611bfy2JOvG/VkkSXuaxDGV3cB/qap/ChwLnJnkKOAs4OqqWgVc3eYBTgRWtdt64MPQCyHgHOAVwDHAOVNBJEmajLGHSlXdX1U3tOnvA7cDy4C1wMY2bCNwapteC1xSPdcAi5McDpwAbKmqXVX1ELAFWDPGjyJJ2stEz/5KsgJ4GXAt8IKquh96wQM8vw1bBtzb97QdrTZdfdD7rE+yNcnWnTt3dvkRJEl9JhYqSX4S+Azw9qp6ZKahA2o1Q/3JxaqLqmp1Va1eunTp7JuVJA1lIqGS5On0AuXjVfXZVn6g7dai3T/Y6juAI/qevhy4b4a6JGlCJnH2V4CLgdur6n19D20Cps7gWgdc2Vd/UzsL7Fjg4bZ7bDNwfJJD2wH641tNkjQhk/ieyiuB3wBuTnJjq/0OcB5weZIzgHuAN7THrgJOArYDjwJvBqiqXUneC1zfxr2nqnaN5yNIkgYZe6hU1f9h8PEQgOMGjC/gzGleawOwobvuJEn7w2t/SZI6Y6hI
kjrjtb8k7RcvLql+bqlIkjpjqEiSOmOoSJI6Y6hIkjpjqEiSOmOoSJI6Y6hIkjpjqEiSOuOXH6UJ6v/ioPRU4JaKJKkzhookqTOGiiSpMx5TkdQZLy4pt1QkSZ1xS0UaM8/40lOZoSJpJNwVdmBy95ckqTOGiiSpM+7+ksbA4yg6UCz4UEmyBvgAsAj4aFWdN+GWJO3F4ysHjgW9+yvJIuCDwInAUcDpSY6abFeSdOBa6FsqxwDbq+pOgCSXAWuB2ybalQ5Y7ubaN7dantoWeqgsA+7tm98BvGJCvWie2N9/tIZ5vuHRjemWo2GzcC30UMmAWj1pULIeWN9m/z7Jt+f4fkuAv5vjc0fJvqaR8weWh+5rmuePysSX1zTG3teQy93lNTv709fQz1voobIDOKJvfjlw396Dquoi4KL9fbMkW6tq9f6+Ttfsa3bsa3bsa3YO9L4W9IF64HpgVZKVSZ4BnAZsmnBPknTAWtBbKlW1O8lbgM30TineUFW3TrgtSTpgLehQAaiqq4CrxvR2+70LbUTsa3bsa3bsa3YO6L5S9aTj2pIkzclCP6YiSZpHDJVpJLk7yc1JbkyytdUOS7IlybZ2f+iYezqy9TN1eyTJ25O8O8l3+uonjamfDUkeTHJLX23gMkrPhUm2J7kpydFj7usPktzR3vtzSRa3+ookP+hbdh8Zc1/T/u2SnN2W17eTnDDmvj7V19PdSW5s9XEuryOSfCXJ7UluTfK2Vp/YOjZDT/Nh/Zqut/GuY1XlbcANuBtYslftfwBntemzgPMn2N8i4G+BnwbeDfzXCfTwKuBo4JZ9LSPgJOAL9L5bdCxw7Zj7Oh44qE2f39fXiv5xE1heA/929C479C3gYGAl8DfAonH1tdfjfwT87gSW1+HA0W362cBft+UysXVshp7mw/o1XW9jXcfcUpmdtcDGNr0ROHWCvRwH/E1V/d9JNVBVXwN27VWebhmtBS6pnmuAxUkOH1dfVfXFqtrdZq+h952msZpmeU1nLXBZVT1WVXcB2+ldlmisfSUJ8OvAJ0fx3jOpqvur6oY2/X3gdnpX0ZjYOjZdT/Nk/ZpueU1nJOuYoTK9Ar6Y5BvpfSMf4AVVdT/0/oDA8yfWXe87Of3/ob+lbXpvGPduub1Mt4wGXVJnphV+lH6T3v/RTlmZ5JtJvprklyfQz6C/3XxZXr8MPFBV2/pqY19eSVYALwOuZZ6sY3v11G/i69eA3sa2jhkq03tlVR1N7wrIZyZ51aQbmpLeFz1PAT7dSh8Gfgb4Z8D99HZXzDdDXVJn5E0k7wJ2Ax9vpfuBF1bVy4B3AJ9I8pwxtjTd325eLC/gdPb8n5exL68kPwl8Bnh7VT0y09ABtZEss+l6mg/r14DexrqOGSrTqKr72v2DwOfobRY+MLU53e4fnFB7JwI3VNUDrccHquqJqvoR8KeMaDfJkKZbRkNdUmeUkqwDXgu8sdpO5bbp/902/Q16+5V/blw9zfC3mw/L6yDg14BPTdXGvbySPJ3eP5Afr6rPtvJE17FpepoX69eg3sa9jhkqAyR5VpJnT03TOwh3C71LwKxrw9YBV06mwz3/73Gv/ca/Sq/XSZluGW0C3tTO0DkWeHhqF8Y4pPdjbu8ETqmqR/vqS9P7XR6SvAhYBdw5xr6m+9ttAk5LcnCSla2v68bVV/Ma4I6q2jFVGOfyasdzLgZur6r39T00sXVsup7mw/o1Q2/jXcfGcVbCQrsBL6J3VsS3gFuBd7X684CrgW3t/rAJ9HYI8F3guX21S4GbgZvainL4mHr5JL3N6X+k9389Z0y3jOhtan+Q3v+p3QysHnNf2+ntP76x3T7Sxv6b9jf+FnAD8Lox9zXt3w54V1te3wZOHGdfrf4x4N/vNXacy+uX6O2Ouanv73bSJNexGXqaD+vXdL2NdR3zG/WSpM64+0uS1BlDRZLUGUNFktQZQ0WS1BlDRZLUGUNFktQZQ0WS1BlDRZLUmf8Hq7azCdYk
slQAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the height column with 100 bins. bins is passed by keyword because\n",
    "# newer pandas releases take by, not bins, as the first positional parameter of plot.hist.\n",
    "full_df.height.plot.hist(bins=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows with a height below 100 are shifted up by 100; all other rows are untouched.\n",
    "# NOTE(review): presumably these are centimetre values that lost their leading 1 - confirm.\n",
    "too_short = full_df['height'] < 100\n",
    "full_df.loc[too_short, 'height'] = full_df.loc[too_short, 'height'] + 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#####  arterial pressure fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace both pressure readings with their absolute values, removing negative signs.\n",
    "for pressure_col in ('ap_hi', 'ap_lo'):\n",
    "    full_df[pressure_col] = full_df[pressure_col].abs()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 80., 215.])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the tails of ap_hi: the 0.3rd and 99.9th percentiles.\n",
    "np.percentile(full_df['ap_hi'], q=[0.3, 99.9])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 60., 110.])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the tails of ap_lo: the 1st and 98th percentiles.\n",
    "np.percentile(full_df['ap_lo'], q=[1, 98])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep dividing ap_hi values above 800 by 10 until no value exceeds 800.\n",
    "# NOTE(review): presumably these are readings entered with extra digits - confirm.\n",
    "while True:\n",
    "    too_high = full_df['ap_hi'] > 800\n",
    "    if not too_high.any():\n",
    "        break\n",
    "    full_df.loc[too_high, 'ap_hi'] /= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Three passes: each pass multiplies every ap_hi value that is still below 22 by 10.\n",
    "for _pass in range(3):\n",
    "    too_low = full_df['ap_hi'] < 22\n",
    "    full_df.loc[too_low, 'ap_hi'] *= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep dividing ap_lo values above 600 by 10 until no value exceeds 600.\n",
    "# NOTE(review): presumably these are readings entered with extra digits - confirm.\n",
    "while True:\n",
    "    too_high = full_df['ap_lo'] > 600\n",
    "    if not too_high.any():\n",
    "        break\n",
    "    full_df.loc[too_high, 'ap_lo'] /= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Three passes: each pass multiplies every ap_lo value that is still below 11 by 10.\n",
    "for _pass in range(3):\n",
    "    too_low = full_df['ap_lo'] < 11\n",
    "    full_df.loc[too_low, 'ap_lo'] *= 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Adding features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>active</th>\n",
       "      <th>age</th>\n",
       "      <th>alco</th>\n",
       "      <th>ap_hi</th>\n",
       "      <th>ap_lo</th>\n",
       "      <th>cardio</th>\n",
       "      <th>cholesterol</th>\n",
       "      <th>gender</th>\n",
       "      <th>gluc</th>\n",
       "      <th>height</th>\n",
       "      <th>id</th>\n",
       "      <th>smoke</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>18393</td>\n",
       "      <td>0.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>168</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>62.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>20228</td>\n",
       "      <td>0.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>156</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>85.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>18857</td>\n",
       "      <td>0.0</td>\n",
       "      <td>130.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>165</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>64.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>17623</td>\n",
       "      <td>0.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>169</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>82.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>17474</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>156</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>56.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   active    age  alco  ap_hi  ap_lo  cardio  cholesterol  gender  gluc  \\\n",
       "0     1.0  18393   0.0  110.0   80.0     0.0            1       2     1   \n",
       "1     1.0  20228   0.0  140.0   90.0     1.0            3       1     1   \n",
       "2     0.0  18857   0.0  130.0   70.0     1.0            3       1     1   \n",
       "3     1.0  17623   0.0  150.0  100.0     1.0            1       2     1   \n",
       "4     0.0  17474   0.0  100.0   60.0     0.0            1       1     1   \n",
       "\n",
       "   height  id  smoke  weight  \n",
       "0     168   0    0.0    62.0  \n",
       "1     156   1    0.0    85.0  \n",
       "2     165   2    0.0    64.0  \n",
       "3     169   3    0.0    82.0  \n",
       "4     156   4    0.0    56.0  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the cleaned, combined frame.\n",
    "full_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Body-mass index feature: weight divided by the square of (height / 100).\n",
    "# NOTE(review): assumes weight is in kg and height in cm - confirm against the data description.\n",
    "height_scaled = full_df['height'] / 100\n",
    "full_df['bmi'] = full_df['weight'] / height_scaled ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f3f4a95ca10>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZsAAAD8CAYAAAChHgmuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGr5JREFUeJzt3X+0XWV95/H3p4mgWDHBBMX8aIKNtOiyY7xgWmuroBDAEjpLp2GckrGxmWXxV52OBOkyHZW1wDqirCo2QmqgSoyIkhEsjYiyZi35EUD5FTC3QMkFNLHhV0WJwc/8sZ9rD+Hcm3Nz73POyc3ntdZZZ+/vfvY+372Tk2+evZ+zt2wTERFR06/1OoGIiJj8UmwiIqK6FJuIiKguxSYiIqpLsYmIiOpSbCIioroUm4iIqC7FJiIiqkuxiYiI6qb2OoFumzFjhufNm9frNCIi9ik333zzT2zP3Nv197tiM2/ePDZt2tTrNCIi9imS/nU86+c0WkREVJdiExER1VUrNpLWSNom6Y7d4u+RdI+kOyV9vCV+pqTBsuz4lvjiEhuUtLIlPl/SDZK2SPqypANq7UtERIxPzZ7NF4DFrQFJbwSWAK+y/QrgEyV+JLAUeEVZ57OSpkiaAnwGOAE4Eji1tAU4FzjP9gLgEWB5xX2JiIhxqFZsbF8H7Ngt/C7gHNtPlTbbSnwJsM72U7bvAwaBo8tr0Pa9tncC64AlkgQcA1xW1l8LnFJrXyIiYny6fc3m5cDry+mv70o6qsRnAVtb2g2V2EjxFwGP2t61WzwiIvpQt4c+TwWmA4uAo4D1kg4H1KataV8MPUr7tiStAFYAzJ07d4wpR0TEeHW7ZzMEXO7GjcAvgRklPqel3WzgoVHiPwGmSZq6W7wt26ttD9gemDlzr3+TFBERe6nbxebrNNdakPRy4ACawrEBWCrpQEnzgQXAjcBNwIIy8uwAmkEEG2wbuBZ4a9nuMuCKru5JRER0rNppNEmXAm8AZkgaAlYBa4A1ZTj0TmBZKRx3SloP3AXsAk63/XTZzruBq4EpwBrbd5aPOANYJ+ljwK3ARbX2ZV8wb+WVz5i//5yTepRJRMSzVSs2tk8dYdF/G6H92cDZbeJXAVe1id9LM1otIiL6XO4gEBER1aXYREREdSk2ERFRXYpNRERUl2ITERHVpdhERER1KTYREVFdik1ERFSXYhMREdWl2ERERHUpNhERUV2KTUREVNfth6dFl7TeBTp3gI6IXkvPJiIiqkuxiYiI6lJsIiKiuhSbiIioLsUmIiKqq1ZsJK2RtE3SHW2W/ZUkS5pR5iXpfEmDkm6TtLCl7TJJW8prWUv8NZJuL+ucL0m19iUiIsanZs/mC8Di3YOS5gBvBh5oCZ8ALCivFcAFpe0hwCrgtcDRwCpJ08s6F5S2w+s967MiIqI/VCs2tq8DdrRZdB7wQcAtsSXAxW5cD0yTdBhwPLDR9g7bjwAbgcVl2cG2v2fbwMXAKbX2JSIixqer12wknQw8aPsHuy2aBWxtmR8qsdHiQ23iI33uCkmbJG3avn37OPYgIiL2RteKjaSDgLOAD7db3CbmvYi3ZXu17QHbAzNnzuwk3YiImEDd7Nm8DJgP/EDS/cBs4BZJL6HpmcxpaTsbeGgP8dlt4hER0Ye6Vmxs3277UNvzbM+jKRgLbf8I2ACcVkalLQIes/0wcDVwnKTpZWDAccDVZdkTkhaVUWinAVd0a18iImJsag59vhT4HnCEpCFJy0dpfhVwLzAIfB74CwDbO4CPAjeV10dKDOBdwIVlnX8BvlljPyIiYvyq3fXZ9ql7WD6vZdrA6SO0WwOsaRPfBLxyfFlGREQ35A4CERFRXYpNRERUl2ITERHVpdhERER1KTYREVFdik1ERFSXYhMREdWl2ERERHUpNhERUV2KTUREVJdiExER1aXYREREdSk2ERFRXYpNRERUl2IT
ERHVpdhERER1KTYREVFdzcdCr5G0TdIdLbG/lXS3pNskfU3StJZlZ0oalHSPpONb4otLbFDSypb4fEk3SNoi6cuSDqi1LxERMT41ezZfABbvFtsIvNL2q4AfAmcCSDoSWAq8oqzzWUlTJE0BPgOcABwJnFraApwLnGd7AfAIsLzivkRExDhUKza2rwN27Bb7Z9u7yuz1wOwyvQRYZ/sp2/cBg8DR5TVo+17bO4F1wBJJAo4BLivrrwVOqbUvERExPr28ZvNnwDfL9Cxga8uyoRIbKf4i4NGWwjUcb0vSCkmbJG3avn37BKUfERGd6kmxkXQWsAv44nCoTTPvRbwt26ttD9gemDlz5ljTjYiIcZra7Q+UtAx4C3Cs7eECMQTMaWk2G3ioTLeL/wSYJmlq6d20to+IiD7T1WIjaTFwBvCHtp9sWbQB+JKkTwIvBRYAN9L0YBZImg88SDOI4L/atqRrgbfSXMdZBlzRvT3Zt8xbeeWvpu8/56QeZhIR+6uaQ58vBb4HHCFpSNJy4O+AFwAbJX1f0ucAbN8JrAfuAv4JON3206XX8m7gamAzsL60haZofUDSIM01nItq7UtERIxPtZ6N7VPbhEcsCLbPBs5uE78KuKpN/F6a0WoREdHncgeBiIioLsUmIiKqS7GJiIjqUmwiIqK6FJuIiKguxSYiIqpLsYmIiOpSbCIioroUm4iIqC7FJiIiqkuxiYiI6lJsIiKiuhSbiIioLsUmIiKqS7GJiIjquv5Y6OitPLUzInohPZuIiKiu5mOh10jaJumOltghkjZK2lLep5e4JJ0vaVDSbZIWtqyzrLTfImlZS/w1km4v65wvSbX2JSIixqdmz+YLwOLdYiuBa2wvAK4p8wAnAAvKawVwATTFCVgFvJbmEdCrhgtUabOiZb3dPysiIvpER8VG0ivHumHb1wE7dgsvAdaW6bXAKS3xi924Hpgm6TDgeGCj7R22HwE2AovLsoNtf8+2gYtbthUREX2m057N5yTdKOkvJE0bx+e92PbDAOX90BKfBWxtaTdUYqPFh9rEIyKiD3VUbGz/PvB2YA6wSdKXJL15AvNod73FexFvv3FphaRNkjZt3759L1OMiIi91fE1G9tbgL8GzgD+EDhf0t2S/vMYPu/H5RQY5X1biQ/RFLJhs4GH9hCf3SY+Uu6rbQ/YHpg5c+YY0o2IiInQ6TWbV0k6D9gMHAP8ke3fLtPnjeHzNgDDI8qWAVe0xE8ro9IWAY+V02xXA8dJml4GBhwHXF2WPSFpURmFdlrLtiIios90+qPOvwM+D3zI9s+Gg7YfkvTX7VaQdCnwBmCGpCGaUWXnAOslLQceAN5Wml8FnAgMAk8C7yjb3yHpo8BNpd1HbA8POngXzYi35wHfLK+IiOhDnRabE4Gf2X4aQNKvAc+1/aTtS9qtYPvUEbZ1bJu2Bk4fYTtrgDVt4puAMY+Si4iI7uv0ms23aHoQww4qsYiIiD3qtNg81/a/D8+U6YPqpBQREZNNp8Xmp7vdQuY1wM9GaR8REfErnV6zeT/wFUnDw4sPA/6kTkoRETHZdFRsbN8k6beAI2h+UHm37V9UzSwiIiaNsTzP5ihgXlnn1ZKwfXGVrCIiYlLpqNhIugR4GfB94OkSHr4BZkRExKg67dkMAEeW38NERESMSaej0e4AXlIzkYiImLw67dnMAO6SdCPw1HDQ9slVsoqIiEml02LzNzWTiIiIya3Toc/flfQbwALb35J0EDClbmoRETFZdPqIgT8HLgP+voRmAV+vlVREREwunQ4QOB14HfA4/OpBaoeOukZERETRabF5yvbO4RlJUxnlMcwRERGtOi0235X0IeB5kt4MfAX4v/XSioiIyaTTYrMS2A7cDvwPmidrtn1CZ0RExO46HY32S5rHQn9+Ij5U0l8C76Q5FXc7zWOgDwPWAYcAtwB/anunpANpbovzGuDfgD+xfX/ZzpnAcppb6LzX9tUTkd++Yt7KK3udQkRE
RzodjXafpHt3f+3NB0qaBbwXGLD9Spoh1EuBc4HzbC8AHqEpIpT3R2z/JnBeaYekI8t6rwAWA5+VlOHYERF9qNPTaAM0d30+Cng9cD7wj+P43Kk013+m0jzx82HgGJrh1QBrgVPK9JIyT1l+rCSV+DrbT9m+DxgEjh5HThERUUlHxcb2v7W8HrT9KZriMGa2HwQ+ATxAU2QeA24GHrW9qzQbovktD+V9a1l3V2n/otZ4m3UiIqKPdPqIgYUts79G09N5wd58oKTpNL2S+cCjNCPbTmjTdHhotUZYNlK83WeuAFYAzJ07d4wZR0TEeHV6b7T/0zK9C7gf+C97+ZlvAu6zvR1A0uXA7wHTJE0tvZfZwPAjqIeAOcBQOe32QmBHS3xY6zrPYHs1sBpgYGAgvw+KiOiyTkejvXECP/MBYFG5v9rPgGOBTcC1wFtpRqQtA64o7TeU+e+V5d+2bUkbgC9J+iTwUmABcOME5hkREROk09NoHxhtue1PdvqBtm+QdBnN8OZdwK00vY4rgXWSPlZiF5VVLgIukTRI06NZWrZzp6T1wF1lO6fbfpqIiOg7Y3lS51E0vQyAPwKu45kX6DtmexWwarfwvbQZTWb758DbRtjO2cDZe5NDRER0z1genrbQ9hMAkv4G+Irtd9ZKLOpr/VHo/eec1MNMImKy6/R3NnOBnS3zO4F5E55NRERMSp32bC4BbpT0NZrhxX9McwuZiIiIPep0NNrZkr5Jc/cAgHfYvrVeWhERMZl0ehoNmtvKPG770zS/eZlfKaeIiJhkOr0R5yrgDODMEnoO47s3WkRE7Ec67dn8MXAy8FMA2w+xl7eriYiI/U+nxWanbVPuPSbp+fVSioiIyabTYrNe0t/T3L/sz4FvMUEPUouIiMmv09Fon5D0ZuBx4Ajgw7Y3Vs0sIiImjT0Wm/L0y6ttvwlIgYmIiDHb42m0cnPLJyW9sAv5RETEJNTpHQR+DtwuaSNlRBqA7fdWySq6LvdJi4iaOi02V5ZXRETEmI1abCTNtf2A7bXdSigiIiafPV2z+frwhKSvVs4lIiImqT0VG7VMH14zkYiImLz2VGw8wvS4SJom6TJJd0vaLOl3JR0iaaOkLeV9emkrSedLGpR0m6SFLdtZVtpvkbRsovKLiIiJtadi8zuSHpf0BPCqMv24pCckPT6Oz/008E+2fwv4HWAzsBK4xvYC4JoyD3ACsKC8VgAXAEg6hObR0q+leZz0quECFRER/WXUYmN7iu2Dbb/A9tQyPTx/8N58oKSDgT8ALiqfsdP2o8ASYHggwlrglDK9BLjYjetpbplzGHA8sNH2DtuP0PzgdPHe5BQREXWN5Xk2E+VwYDvwD5JulXRhubHni20/DFDeDy3tZwFbW9YfKrGR4hER0Wd6UWymAguBC2y/muZHoitHaa82MY8Sf/YGpBWSNknatH379rHmGxER49SLYjMEDNm+ocxfRlN8flxOj1Het7W0n9Oy/mzgoVHiz2J7te0B2wMzZ86csB2JiIjOdL3Y2P4RsFXSESV0LHAXsAEYHlG2DLiiTG8ATiuj0hYBj5XTbFcDx0maXgYGHFdiERHRZzq9Xc1Eew/wRUkHAPcC76ApfOslLQceAN5W2l4FnAgMAk+WttjeIemjwE2l3Uds7+jeLkRERKd6Umxsfx8YaLPo2DZtDZw+wnbWAGsmNruIiJhovbhmExER+5kUm4iIqC7FJiIiqkuxiYiI6lJsIiKiuhSbiIioLsUmIiKqS7GJiIjqenUHgehj81Ze+avp+885qYeZRMRkkZ5NRERUl2ITERHVpdhERER1KTYREVFdik1ERFSXYhMREdWl2ERERHUpNhERUV2KTUREVNezYiNpiqRbJX2jzM+XdIOkLZK+LOmAEj+wzA+W5fNatnFmid8j6fje7ElEROxJL3s27wM2t8yfC5xnewHwCLC8xJcDj9j+TeC80g5JRwJLgVcAi4HPSprSpdwjImIMelJsJM0G
TgIuLPMCjgEuK03WAqeU6SVlnrL82NJ+CbDO9lO27wMGgaO7swcRETEWvboR56eADwIvKPMvAh61vavMDwGzyvQsYCuA7V2SHivtZwHXt2yzdZ1nkLQCWAEwd+7ciduLHmi9SWZExL6i6z0bSW8Bttm+uTXcpqn3sGy0dZ4ZtFfbHrA9MHPmzDHlGxER49eLns3rgJMlnQg8FziYpqczTdLU0ruZDTxU2g8Bc4AhSVOBFwI7WuLDWteJiIg+0vWeje0zbc+2PY/mAv+3bb8duBZ4a2m2DLiiTG8o85Tl37btEl9aRqvNBxYAN3ZpNyIiYgz66eFpZwDrJH0MuBW4qMQvAi6RNEjTo1kKYPtOSeuBu4BdwOm2n+5+2pNbHqQWEROhp8XG9neA75Tpe2kzmsz2z4G3jbD+2cDZ9TKMiIiJkDsIREREdSk2ERFRXYpNRERUl2ITERHVpdhERER1KTYREVFdik1ERFSXYhMREdWl2ERERHX9dLua6HO5dU1E7K30bCIioroUm4iIqC7FJiIiqkuxiYiI6lJsIiKiuhSbiIioLsUmIiKq63qxkTRH0rWSNku6U9L7SvwQSRslbSnv00tcks6XNCjpNkkLW7a1rLTfImlZt/clIiI604uezS7gf9r+bWARcLqkI4GVwDW2FwDXlHmAE4AF5bUCuACa4gSsAl5L8zjpVcMFKiIi+kvXi43th23fUqafADYDs4AlwNrSbC1wSpleAlzsxvXANEmHAccDG23vsP0IsBFY3MVdiYiIDvX0mo2kecCrgRuAF9t+GJqCBBxams0CtrasNlRiI8UjIqLP9KzYSPp14KvA+20/PlrTNjGPEm/3WSskbZK0afv27WNPNiIixqUnN+KU9ByaQvNF25eX8I8lHWb74XKabFuJDwFzWlafDTxU4m/YLf6ddp9nezWwGmBgYKBtQYqxyU05I2IsejEaTcBFwGbbn2xZtAEYHlG2DLiiJX5aGZW2CHisnGa7GjhO0vQyMOC4EouIiD7Ti57N64A/BW6X9P0S+xBwDrBe0nLgAeBtZdlVwInAIPAk8A4A2zskfRS4qbT7iO0d3dmFiIgYi64XG9v/j/bXWwCObdPewOkjbGsNsGbisutPraesIiL2RbmDQEREVJdiExER1eWx0DFuGZkWEXuSnk1ERFSXYhMREdWl2ERERHUpNhERUV2KTUREVJdiExER1WXoc0yoDIOOiHbSs4mIiOpSbCIiorqcRotqckotIoal2PSp3Ok5IiaTnEaLiIjq0rOJrsgptYj9W3o2ERFR3T7fs5G0GPg0MAW40PY5PU4p9iC9nIj9zz5dbCRNAT4DvBkYAm6StMH2Xb3NLDqVwhOxf9iniw1wNDBo+14ASeuAJcA+WWz29xFoI+1/ilDEvm9fLzazgK0t80PAa3uUy17Z3wtMJ8Z6jFKcIvrPvl5s1CbmZzWSVgAryuy/S7pnlG3OAH4yAbnV1O859jQ/ndtRs34/hpAcJ0pyHL8ZwG+MZwP7erEZAua0zM8GHtq9ke3VwOpONihpk+2BiUmvjn7Psd/zg+Q4UZLjxOj3HEt+88azjX196PNNwAJJ8yUdACwFNvQ4p4iI2M0+3bOxvUvSu4GraYY+r7F9Z4/TioiI3ezTxQbA9lXAVRO4yY5Ot/VYv+fY7/lBcpwoyXFi9HuO485P9rOup0dEREyoff2aTURE7ANSbApJiyXdI2lQ0spe5wMgaY6kayVtlnSnpPeV+CGSNkraUt6n90GuUyTdKukbZX6+pBtKjl8uAzh6md80SZdJurscz9/tt+Mo6S/Ln/Mdki6V9NxeH0dJayRtk3RHS6ztcVPj/PIduk3Swh7l97flz/k2SV+TNK1l2Zklv3skHV87v5FybFn2V5IsaUaZ7/oxHC1HSe8px+pOSR9viY/9ONre7180gwv+BTgcOAD4AXBkH+R1GLCwTL8A+CFwJPBxYGWJrwTO7YNcPwB8CfhGmV8PLC3TnwPe
1eP81gLvLNMHANP66TjS/ED5PuB5Lcfvv/f6OAJ/ACwE7miJtT1uwInAN2l+/7YIuKFH+R0HTC3T57bkd2T5bh8IzC/f+Sm9yLHE59AMbvpXYEavjuEox/GNwLeAA8v8oeM5junZNH512xvbO4Hh2970lO2Hbd9Spp8ANtP8o7SE5h9PyvspvcmwIWk2cBJwYZkXcAxwWWnS0xwlHUzzZboIwPZO24/SZ8eRZsDO8yRNBQ4CHqbHx9H2dcCO3cIjHbclwMVuXA9Mk3RYt/Oz/c+2d5XZ62l+fzec3zrbT9m+Dxik+e5XNcIxBDgP+CDP/CF614/hKDm+CzjH9lOlzbaWHMd8HFNsGu1uezOrR7m0JWke8GrgBuDFth+GpiABh/YuMwA+RfOl+WWZfxHwaMsXvtfH83BgO/AP5VTfhZKeTx8dR9sPAp8AHqApMo8BN9Nfx3HYSMetH79Hf0bTU4A+yk/SycCDtn+w26K+yRF4OfD6chr3u5KOKvG9yjHFptHRbW96RdKvA18F3m/78V7n00rSW4Bttm9uDbdp2svjOZXmFMEFtl8N/JTm9E/fKNc9ltCclngp8HzghDZN++bvZRt99ecu6SxgF/DF4VCbZl3PT9JBwFnAh9stbhPr1TGcCkynOZ33v4D15azFXuWYYtPo6LY3vSDpOTSF5ou2Ly/hHw93rcv7tpHW74LXASdLup/m9OMxND2daeV0EPT+eA4BQ7ZvKPOX0RSffjqObwLus73d9i+Ay4Hfo7+O47CRjlvffI8kLQPeArzd5UID/ZPfy2j+U/GD8r2ZDdwi6SX0T46UXC4vp/RupDlzMYO9zDHFptGXt70p/4u4CNhs+5MtizYAy8r0MuCKbuc2zPaZtme7uW/SUuDbtt8OXAu8tTTrdY4/ArZKOqKEjqV5DEXfHEea02eLJB1U/tyHc+yb49hipOO2ATitjKhaBDw2fLqtm9Q8UPEM4GTbT7Ys2gAslXSgpPnAAuDGbudn+3bbh9qeV743QzQDgX5EnxzD4us0/3lE0stpBtb8hL09jt0Y6bAvvGhGgfyQZmTFWb3Op+T0+zTd09uA75fXiTTXRK4BtpT3Q3qda8n3DfzHaLTDy1/AQeArlBEtPcztPwGbyrH8Os3pgb46jsD/Bu4G7gAuoRnt09PjCFxKcw3pFzT/KC4f6bjRnF75TPkO3Q4M9Ci/QZprCsPfmc+1tD+r5HcPcEKvjuFuy+/nP0ajdf0YjnIcDwD+sfx9vAU4ZjzHMXcQiIiI6nIaLSIiqkuxiYiI6lJsIiKiuhSbiIioLsUmIiKqS7GJiIjqUmwiIqK6FJuIiKju/wObCgq06H4doQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the bmi feature with 100 bins. bins is passed by keyword because\n",
    "# newer pandas releases take by, not bins, as the first positional parameter of plot.hist.\n",
    "full_df.bmi.plot.hist(bins=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Split back"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows that carry a cardio label form the training set; rows without one are the\n",
    "# test set, whose all-missing cardio column is dropped.\n",
    "has_label = full_df['cardio'].notnull()\n",
    "train_df = full_df[has_label]\n",
    "test_df = full_df[~has_label].drop('cardio', axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Model train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the test-set columns that contain at least one missing value.\n",
    "has_missing = test_df.isnull().any(axis=0)\n",
    "columns_with_nulls = test_df.columns[has_missing]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target vector and feature matrix; the features are every training column\n",
    "# except the row id and the target itself.\n",
    "y = train_df['cardio']\n",
    "X = train_df.drop(['id', 'cardio'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test features: drop the id column, then encode every missing value as -999.\n",
    "X_test = test_df.drop(['id'], axis=1).fillna(-999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0dca2da34210483aac5b3be87d6b89b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0), HTML(value=u'')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Train an ensemble of 100 CatBoost classifiers, each on a differently masked copy of X.\n",
    "models = []\n",
    "\n",
    "np.random.seed(42)  # fixed seed so the masks and the per-model seeds are repeatable\n",
    "for i in tqdm.tqdm_notebook(range(100)):\n",
    "    X_train = X.copy()\n",
    "    # For every column that has missing values in the test set, overwrite a random\n",
    "    # tenth of the training rows with the same -999 placeholder that X_test uses.\n",
    "    for col in columns_with_nulls:\n",
    "        _idx = np.random.choice(X_train.index, size=X_train.shape[0]//10, replace=False)\n",
    "        X_train.loc[_idx, col] = -999\n",
    "    \n",
    "    model = CatBoostClassifier(\n",
    "        iterations=1000,\n",
    "        depth=6,\n",
    "        thread_count=12,\n",
    "        border_count=128,\n",
    "        learning_rate=0.015,\n",
    "        random_seed=np.random.randint(10**10),\n",
    "        logging_level='Silent'\n",
    "    )\n",
    "    \n",
    "    model.fit(X_train, y)\n",
    "    # A new classifier object is built on every pass, so the fitted model can be stored\n",
    "    # directly; copying it first only duplicated the model and cost time and memory.\n",
    "    models.append(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "997f38f7e1244754afa9a30ad9577651",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10), HTML(value=u'')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Train 10 LightGBM classifiers, each on a freshly masked copy of X.\n",
    "models2 = []\n",
    "\n",
    "np.random.seed(42)\n",
    "n_masked_rows = X.shape[0] // 10\n",
    "for run in tqdm.tqdm_notebook(range(10)):\n",
    "    X_train = X.copy()\n",
    "    # Overwrite a random tenth of the rows with -999 in each column that has\n",
    "    # missing values in the test set.\n",
    "    for null_col in columns_with_nulls:\n",
    "        masked_rows = np.random.choice(X_train.index, size=n_masked_rows, replace=False)\n",
    "        X_train.loc[masked_rows, null_col] = -999\n",
    "\n",
    "    # The three random draws must stay in this order (seed, depth offset, leaves\n",
    "    # offset) so the random stream is consumed exactly as before.\n",
    "    model_seed = np.random.randint(10**10)\n",
    "    depth_limit = 15 + np.random.randint(0,10)\n",
    "    leaf_count = 20 + np.random.randint(0,10)\n",
    "\n",
    "    model = lightgbm.LGBMClassifier(\n",
    "        seed=model_seed,\n",
    "        n_estimators=3680,\n",
    "        max_depth=depth_limit,\n",
    "        num_leaves=leaf_count,\n",
    "        subsample=0.99868,\n",
    "        colsample_bytree=0.8022,\n",
    "        reg_alpha=26.4310,\n",
    "        reg_lambda=19.7836,\n",
    "        max_bin=8850,\n",
    "        objective='binary',\n",
    "        nthread=12\n",
    "    )\n",
    "    model.fit(X_train, y)\n",
    "    models2.append(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "38940c736c9f4b9c8a099cf9c285c50b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0), HTML(value=u'')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Collect column 1 of predict_proba from every CatBoost model, one array per model,\n",
    "# then transpose so that each model becomes a column.\n",
    "predictions = np.vstack([\n",
    "    fitted.predict_proba(X_test)[:,1]\n",
    "    for fitted in tqdm.tqdm_notebook(models)\n",
    "]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5ce6cc52814e42fda3d91dde5e5bd243",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10), HTML(value=u'')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Collect column 1 of predict_proba from every LightGBM model, one array per model,\n",
    "# then transpose so that each model becomes a column.\n",
    "predictions2 = np.vstack([\n",
    "    fitted.predict_proba(X_test)[:,1]\n",
    "    for fitted in tqdm.tqdm_notebook(models2)\n",
    "]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final score per test row: the plain average over the columns of both prediction\n",
    "# matrices, so every individual model carries the same weight.\n",
    "all_predictions = np.concatenate([predictions, predictions2], axis=1)\n",
    "prediction = all_predictions.mean(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one value per line (no header, no index) to a file named after the current time.\n",
    "timestamp = datetime.today().strftime('%Y%m%d_%H%M')\n",
    "submission_path = 'submission_{date}.csv'.format(date=timestamp)\n",
    "pd.DataFrame(prediction).to_csv(submission_path, index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  },
  "widgets": {
   "state": {
    "298684729f264868a07fbdc28fbb451b": {
     "views": [
      {
       "cell_index": 40
      }
     ]
    },
    "457c04007e0247058f58de51d5922b0c": {
     "views": [
      {
       "cell_index": 31
      }
     ]
    },
    "8ea30877ace841dd80b984ddc27368bd": {
     "views": [
      {
       "cell_index": 38
      }
     ]
    },
    "a6639a7ea13c4d5dbc6a4e5fd346e2c0": {
     "views": [
      {
       "cell_index": 38
      }
     ]
    },
    "f468a7b4d4944cb7984d10fb5e424f9b": {
     "views": [
      {
       "cell_index": 41
      }
     ]
    }
   },
   "version": "1.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
